import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
# Ignoring warnings
import warnings
warnings.filterwarnings('ignore')
# Importing the dataset
# URL of the raw CSV hosted on GitHub.
url = 'https://raw.githubusercontent.com/Lakshit11/CDS-Assignment/main/drug.csv'
data = pd.read_csv(url)
# Getting first 5 rows of dataframe named data
data.head()
# Keep an independent copy of the raw frame. A plain `data_temp = data`
# would only alias the same object, so the later in-place BP recoding on
# data_temp would silently mutate the original data as well.
data_temp = data.copy()
# Missing-value count per column (all zero for this dataset).
data.isnull().sum()
Age 0 Sex 0 BP 0 Cholesterol 0 Na_to_K 0 Drug 0 dtype: int64
There are no null values in the data that has been imported.
# Frequency tables for each categorical column and for the target.
for title, column in [("Sex", "Sex"),
                      ("BP", "BP"),
                      ("Cholesterol", "Cholesterol"),
                      ("Drugs", "Drug")]:
    print("Counts for " + title)
    print(data[column].value_counts(), "\n")
Counts for Sex M 104 F 96 Name: Sex, dtype: int64 Counts for BP HIGH 77 LOW 64 NORMAL 59 Name: BP, dtype: int64 Counts for Cholesterol HIGH 103 NORMAL 97 Name: Cholesterol, dtype: int64 Counts for Drugs DrugY 91 drugX 54 drugA 23 drugC 16 drugB 16 Name: Drug, dtype: int64
This shows that we have 5 types of drugs that can be prescribed to a patient.
# Age And Na_to_K
# Side-by-side box plots of the two numeric columns to eyeball outliers.
numeric_series = [data['Age'], data['Na_to_K']]
bplot = plt.boxplot(numeric_series)
# Bare attribute access: in a notebook this merely echoes the bound
# dict.items method of the boxplot result dict; it has no side effect.
bplot.items
<function dict.items>
From the above box plot we can infer that the Age column has no outliers. We can, however, observe that Na_to_K has a few outliers, which should not be removed: since Na_to_K is the sodium-to-potassium ratio in a patient's blood, these extreme values may be clinically significant for detecting an underlying condition.
# One-hot encode the categorical predictors; the target 'Drug' is left as-is.
categorical = ['Sex', 'BP', 'Cholesterol']
data = pd.get_dummies(data, columns=categorical)
data.head()
| Age | Na_to_K | Drug | Sex_F | Sex_M | BP_HIGH | BP_LOW | BP_NORMAL | Cholesterol_HIGH | Cholesterol_NORMAL | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 23 | 25.355 | DrugY | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| 1 | 47 | 13.093 | drugC | 0 | 1 | 0 | 1 | 0 | 1 | 0 |
| 2 | 47 | 10.114 | drugC | 0 | 1 | 0 | 1 | 0 | 1 | 0 |
| 3 | 28 | 7.798 | drugX | 1 | 0 | 0 | 0 | 1 | 1 | 0 |
| 4 | 61 | 18.043 | DrugY | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
# Feature Matrix: every column except the target.
X = data.drop(columns='Drug')
# target vector
y = data['Drug']

from sklearn.model_selection import train_test_split

# 70/30 split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42)

# Printing shape of each set
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)
(140, 9) (140,) (60, 9) (60,)
from sklearn.linear_model import LogisticRegression

# Logistic regression with sklearn defaults, fitted on the training split.
# fit() returns the estimator itself, so the chained call binds the
# fitted model directly.
LR = LogisticRegression().fit(X_train, y_train)
LogisticRegression()
# Predicted drug labels for the held-out test rows.
predictions = LR.predict(X_test)
# Bare expression so the notebook echoes the prediction array.
predictions
array(['drugX', 'DrugY', 'drugX', 'drugC', 'DrugY', 'DrugY', 'DrugY',
'drugX', 'drugA', 'drugX', 'drugA', 'drugX', 'DrugY', 'drugA',
'drugB', 'DrugY', 'drugB', 'drugX', 'drugC', 'DrugY', 'drugB',
'drugX', 'drugX', 'DrugY', 'DrugY', 'DrugY', 'drugC', 'drugX',
'DrugY', 'drugX', 'DrugY', 'drugC', 'DrugY', 'DrugY', 'drugB',
'DrugY', 'drugX', 'drugA', 'DrugY', 'drugA', 'drugX', 'drugX',
'drugX', 'DrugY', 'DrugY', 'drugC', 'DrugY', 'DrugY', 'drugB',
'drugX', 'drugX', 'DrugY', 'drugX', 'DrugY', 'drugX', 'DrugY',
'drugB', 'DrugY', 'DrugY', 'DrugY'], dtype=object)
# Per-class coefficient matrix: one row per drug class, one column per feature.
# Take the labels from X itself instead of a hand-maintained list so the
# column names can never drift out of sync with the feature matrix.
coef = pd.DataFrame(LR.coef_, columns=X.columns)
coef.head()
| Age | Na_to_K | Sex_F | Sex_M | BP_HIGH | BP_LOW | BP_NORMAL | Cholesterol_HIGH | Cholesterol_NORMAL | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.036629 | 0.614070 | -1.162761 | -0.785805 | -1.091015 | -0.789354 | -0.068197 | -1.194800 | -0.753766 |
| 1 | -0.045351 | -0.042745 | 0.380891 | 0.469552 | 2.305985 | -0.762510 | -0.693032 | 0.617132 | 0.233311 |
| 2 | 0.086629 | -0.287463 | 0.042548 | -0.477581 | 1.510311 | -0.970457 | -0.974887 | -0.381441 | -0.053592 |
| 3 | -0.005366 | -0.139016 | -0.027256 | 0.391246 | -0.793075 | 1.818116 | -0.661052 | 1.258769 | -0.894780 |
| 4 | 0.000718 | -0.144845 | 0.766578 | 0.402588 | -1.932207 | 0.704205 | 2.397167 | -0.299661 | 1.468827 |
# One intercept per drug class (5 values).
print(LR.intercept_)
[-2.48490453 1.06664624 -0.53440027 0.4407133 1.51194526]
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out test split.
report = classification_report(y_test, predictions)
print(report)
precision recall f1-score support
DrugY 0.96 0.96 0.96 26
drugA 1.00 0.71 0.83 7
drugB 0.50 1.00 0.67 3
drugC 1.00 0.83 0.91 6
drugX 1.00 1.00 1.00 18
accuracy 0.93 60
macro avg 0.89 0.90 0.87 60
weighted avg 0.96 0.93 0.94 60
# Accuracy
from sklearn.metrics import accuracy_score

# Fraction of correct test predictions, expressed as a percentage.
acc_sc_LR = 100 * accuracy_score(y_test, predictions)
print('Accuracy of the developed Logistic Regression Model is:', acc_sc_LR, '%')
Accuracy of the developed Logistic Regression Model is: 93.33333333333333 %
# Splitting data again for SVM
# NOTE(review): this uses the default test_size (0.25) and a different seed
# than the LR split, so the SVM accuracy is measured on a different test
# set than the logistic-regression accuracy — confirm this is intended.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X, y, random_state=0)
from sklearn.svm import SVC

# Support vector classifier with default hyperparameters,
# fitted on the second (SVM-specific) training split.
sv = SVC().fit(X_train1, y_train1)
SVC()
# Predicted drug labels for the SVM's own test split.
sv_pred = sv.predict(X_test1)
# Bare expression so the notebook echoes the prediction array.
sv_pred
array(['drugX', 'drugX', 'DrugY', 'DrugY', 'DrugY', 'drugX', 'drugX',
'drugX', 'DrugY', 'drugX', 'drugX', 'DrugY', 'DrugY', 'DrugY',
'drugX', 'drugX', 'DrugY', 'DrugY', 'drugX', 'DrugY', 'DrugY',
'drugX', 'drugX', 'drugX', 'DrugY', 'DrugY', 'DrugY', 'DrugY',
'DrugY', 'drugX', 'drugX', 'DrugY', 'drugX', 'drugX', 'DrugY',
'DrugY', 'drugX', 'DrugY', 'drugX', 'drugX', 'DrugY', 'drugX',
'DrugY', 'drugX', 'DrugY', 'drugX', 'DrugY', 'DrugY', 'drugX',
'DrugY'], dtype=object)
# Recode blood pressure as an ordinal integer on the copy of the raw data
# (needed below, where the plot uses BP as a numeric marker size).
bp_levels = {'HIGH': 3, 'NORMAL': 2, 'LOW': 1}
data_temp['BP'] = data_temp['BP'].replace(bp_levels)
data_temp.head()
| Age | Sex | BP | Cholesterol | Na_to_K | Drug | |
|---|---|---|---|---|---|---|
| 0 | 23 | F | 3 | HIGH | 25.355 | DrugY |
| 1 | 47 | M | 1 | HIGH | 13.093 | drugC |
| 2 | 47 | M | 1 | HIGH | 10.114 | drugC |
| 3 | 28 | F | 2 | HIGH | 7.798 | drugX |
| 4 | 61 | F | 1 | HIGH | 18.043 | DrugY |
# plot
# Interactive scatter: Age vs Na_to_K, colored by the prescribed drug;
# marker size encodes the ordinal BP code (3=HIGH, 2=NORMAL, 1=LOW).
fig = px.scatter(data_temp, x='Age', y = 'Na_to_K',color = 'Drug', size = 'BP')
fig.show()
# Percentage accuracy of the SVM on its own held-out split.
acc_sc_svm = 100 * accuracy_score(y_test1, sv_pred)
print('Accuracy of SVM is:', acc_sc_svm, '%')
Accuracy of SVM is: 82.0 %
from sklearn.neighbors import KNeighborsClassifier
# A Dictionary to store key as the number of neighbors and value as the accuracy of the model
acc = {}
for i in range(1, 60):
    knn = KNeighborsClassifier(n_neighbors=i)
    # BUG FIX: fit on the TRAINING split and score on the TEST split.
    # The original fitted and scored on the test set itself, which only
    # measures memorization of the test data, not generalization.
    knn.fit(X_train, y_train)
    acc[i] = 100 * knn.score(X_test, y_test)
import matplotlib.pyplot as plt

# Accuracy-vs-k curve from the neighbor sweep above. dict preserves
# insertion order, so xx runs 1..59 in order.
xx, yy = map(list, zip(*acc.items()))
plt.xlabel("Number of neighbors")
plt.ylabel("Accuracy")
plt.plot(xx, yy)
[<matplotlib.lines.Line2D at 0x2a4143fff10>]
From the above plot we can see that the model accuracy behaves abnormally near the extremes, where the model is either overfitted or underfitted. The ideal number of neighbors lies between 20 and 25.
# Restrict the search to list indices 20..39 (k = 21..40), away from the
# overfitted/underfitted extremes of the curve.
window = yy[20:40]
best_acc = np.max(window)
# BUG FIX: take the index within the SAME window the max came from.
# The original called yy.index(...) on the full list, so whenever the
# windowed maximum also occurred outside the window the reported k was
# wrong (or xx[20:40][...] raised IndexError).
pos = xx[20:40][window.index(best_acc)]
print('The best accuracy that KNN can provide after excluding the extreme cases is: ', best_acc, 'and the optimum number of neighbors is: ', pos)
The best accuracy that KNN can provide after excluding the extreme cases is: 65.0 and the optimum number of neighbors is: 27
# Side-by-side comparison of the three models' accuracies.
print('Accuracy score for Logistic Regression is: ', acc_sc_LR,'%')
print('Accuracy score for Support Vector Machine(SVM) algorithm is: ',acc_sc_svm,'%')
# KNN is summarized by its mean accuracy across all k values tried.
print('The average accuracy of KNN Algorithm in our case is: ', np.mean(yy))
Accuracy score for Logistic Regression is: 93.33333333333333 % Accuracy score for Support Vector Machine(SVM) algorithm is: 82.0 % The average accuracy of KNN Algorithm in our case is: 55.225988700564976